# ---------------------------------------------------------------
# Vehicle-silhouette classification: data loading and initial EDA
# ---------------------------------------------------------------
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (precision_recall_fscore_support, mean_absolute_error,
                             accuracy_score, confusion_matrix, classification_report)
from sklearn.svm import SVC

# %matplotlib inline   # notebook magic -- invalid syntax in a plain script, kept as a comment
warnings.filterwarnings('ignore')

# Load the raw data and peek at the first rows.
data = pd.read_csv('Part3 - vehicle.csv')
data.head()

# Work on a deep copy so the original frame stays untouched.
df = data.copy(deep=True)

rows, column = df.shape
print('The dataset contains', rows, 'rows and', column, 'columns.')

df.info()
# Except the variable 'class', all columns are numerical.

# Check for duplicate and null values.
print('Duplicated rows: ', df[df.duplicated()].shape[0])  # no duplicate rows

df.isnull().sum()
# Many null values are present; instead of dropping those rows, replace the
# nulls in each numeric column with the column median (robust to outliers).
for cols in df.columns:
    if cols != 'class':
        df[cols] = df[cols].fillna(df[cols].median())

df.describe().T

# Univariate distribution of every numeric column.
plt.figure(figsize=(25, 20))
col = 1
for i in df.drop(columns='class').columns:
    plt.subplot(6, 3, col)
    # sns.distplot is deprecated (removed in modern seaborn);
    # histplot(..., kde=True) is the supported replacement.
    sns.histplot(df[i], color='b', kde=True)
    col += 1
# Observations:
# 1) Columns are distributed across multiple scales.
# 2) Several columns are not unimodal (e.g. distance_circularity,
#    hollows_ratio, elongatedness).
# 3) skewness_about and skewness_about.1 are right skewed, while
#    skewness_about.2 is nearly normal.
# 4) Some columns have a long right tail (e.g. pr.axis_aspect_ratio),
#    so outliers are likely.
# Distribution of the target variable.
print(df['class'].value_counts())
sns.countplot(x='class', data=df)
plt.title('Count of Vehicle Class column')
# Vehicle counts in 'class': 429 car, 218 bus, 199 van.
# Boxplots are plotted next to understand the outliers.
def _boxplot_grid(frame):
    """Draw a 6x3 grid of boxplots, one per non-class column of *frame*."""
    plt.figure(figsize=(25, 20))
    for pos, name in enumerate(frame.drop(columns='class').columns, start=1):
        plt.subplot(6, 3, pos)
        # pass the data as x= explicitly (positional data is deprecated in seaborn)
        sns.boxplot(x=frame[name])


# Boxplots of every numeric column to inspect outliers.
_boxplot_grid(df)
# Outliers: the boxplots reveal outliers in 7 different columns; for better
# clarity, the most affected columns are plotted individually below.

plt.figure(figsize=(18, 5))
plt.subplot(1, 4, 1)
df['scaled_radius_of_gyration.1'].plot(kind='box')
plt.title('Boxplot of Scaled Radius of Gyration 1 Column')
plt.subplot(1, 4, 2)
df['pr.axis_aspect_ratio'].plot(kind='box')
plt.title('Boxplot of pr.axis_aspect_ratio Column')
plt.subplot(1, 4, 3)
df['radius_ratio'].plot(kind='box')
plt.title('Boxplot of Radius Ratio Column')
plt.subplot(1, 4, 4)
df['skewness_about'].plot(kind='box')
plt.title('Boxplot of Skewness About Column')
# scaled_radius_of_gyration.1 and pr.axis_aspect_ratio carry most of the
# outliers present across the columns.

plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
df['scaled_variance'].plot(kind='box')
plt.title('Boxplot of Scaled Variance Column')
plt.subplot(1, 3, 2)
df['scaled_variance.1'].plot(kind='box')
plt.title('Boxplot of Scaled Variance 1 Column')
plt.subplot(1, 3, 3)
df['skewness_about.1'].plot(kind='box')
plt.title('Boxplot of Skewness About 1 Column')


def replace_outliers_with_median(frame, skip=('class',)):
    """Replace IQR outliers in place with the column median.

    A value is an outlier when it lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    Columns named in *skip* are left untouched.
    """
    for col_name in frame.columns:
        if col_name in skip:
            continue
        q1 = frame[col_name].quantile(0.25)
        q3 = frame[col_name].quantile(0.75)
        iqr = q3 - q1
        low = q1 - 1.5 * iqr
        high = q3 + 1.5 * iqr
        mask = (frame[col_name] < low) | (frame[col_name] > high)
        frame.loc[mask, col_name] = frame[col_name].median()


replace_outliers_with_median(df)

# Re-draw the boxplots to confirm the outlier treatment worked.
_boxplot_grid(df)

# Pairwise relationships colored by vehicle class.
sns.pairplot(df, diag_kind='kde', hue='class')

# Pairwise Pearson correlation of the numeric columns.
# (Drop the string 'class' column first: plain df.corr() raises a TypeError
# on non-numeric columns in pandas >= 2.0.)
# Positive values: variables move together; negative values: they move in
# opposite directions; larger magnitude means stronger linear association.
correlation = df.drop(columns='class').corr()
correlation

plt.figure(figsize=(20, 10))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title('Correlation HeatMap', fontsize=15)
sns.heatmap(correlation, annot=True, cmap='viridis')
# Next: split dependent and independent variables; the dependent variable (y)
# is transformed into an encoded categorical column.
# Independent variables X and encoded target y.
X = df.loc[:, df.columns != 'class']
y = df['class'].astype('category').cat.codes

# Correlation of each feature with the encoded class.
plt.figure(figsize=(20, 10))
sns.barplot(x=X.columns, y=X.corrwith(y))
plt.title('Correlation with Class column', fontsize=20)

# RobustScaler standardizes each column with median/IQR, bringing variables
# that sit on very different scales onto a comparable one while staying
# robust to any remaining outliers.
X_col = X.columns
scaler = RobustScaler()
X_std = pd.DataFrame(scaler.fit_transform(X))
X_std.columns = X_col  # restore the original column names
X_std.head()

# Full PCA to inspect how much variance each component captures.
pca = PCA()
X_pca_ = pca.fit_transform(X_std)

# Derive the component count from the data instead of hard-coding 18.
n_features = X_std.shape[1]
cum_var = np.cumsum(pca.explained_variance_ratio_) * 100

plt.figure(figsize=(12, 8))
plt.plot(cum_var, marker='X')
plt.xlim(0, n_features)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance')

plt.figure(figsize=(12, 8))
plt.step(range(n_features), cum_var, where='mid')
plt.xlim(0, n_features)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance')
plt.title('Vehicle Dataset Explained Variance')
# Findings after applying PCA: the first seven components explain more than
# 95% of the variation (the first five already capture over 91%), so the
# 8th component onwards can be dropped.
print('Eigen Values: \n')
pca.explained_variance_

print('Eigen Vectors: \n')
pca.components_

# The percentage of variation explained by each eigenvector.
print('The percentage of variation explained by each Eigen Vector: \n')
pca.explained_variance_ratio_

# PCA visualization: how information is captured across components.
# Use the fitted component count rather than the hard-coded 18.
n_components = len(pca.explained_variance_ratio_)
percent_variance = np.round(pca.explained_variance_ratio_ * 100, decimals=2)

plt.figure(figsize=(12, 8))
plt.bar(x=list(range(n_components)), height=percent_variance)
plt.xlim(0, n_components)
# the bars show per-component variance, not the cumulative total
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Components')

plt.figure(figsize=(12, 8))
plt.step(range(n_components), np.cumsum(pca.explained_variance_ratio_))
plt.bar(range(n_components), pca.explained_variance_ratio_)
plt.xlim(0, n_components)
plt.ylabel('Explained Variance')
plt.xlabel('Principal Components')

plt.figure(figsize=(12, 8))
plt.plot(range(n_components), np.cumsum(pca.explained_variance_ratio_), marker='X')
plt.plot(range(n_components), pca.explained_variance_ratio_, marker='D')
plt.xlim(0, n_components)
plt.ylabel('Explained Variance')
plt.xlabel('Principal Components')
# In the graph above, the first line (marker 'X') is the CUMULATIVE explained
# variance and the second line (marker 'D') the per-component explained
# variance -- the original markdown had the two swapped.
# Keep only the first seven principal components (>95% of the variance).
pca_7 = PCA(n_components=7)
X_pca = pca_7.fit_transform(X_std)
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_pca.shape[1])

# Inspect the reduced dataset as a DataFrame.
pca_df = pd.DataFrame(data=X_pca)
pca_df.head()

# Pairplot of the PCA-transformed dataset.
# NOTE(review): principal components are mutually uncorrelated by
# construction, so the off-diagonal panels should show no linear trends.
sns.pairplot(pca_df, diag_kind='kde')
# Hold out 30% of the scaled (non-PCA) data for testing; fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, test_size=0.3, random_state=10)

rtr, ctr = X_train.shape
print('The training set comprises of', rtr, 'rows and', ctr, 'columns.')

rt, ct = X_test.shape
print('The test set comprises of', rt, 'rows and', ct, 'columns.')
# ---------------------------------------------------------------
# Baseline linear SVC, evaluated on the PCA-reduced data and then on
# the full scaled data.
# ---------------------------------------------------------------

# 70/30 split of the PCA-reduced data (same seed as the non-PCA split).
X_tr, X_te, y_tr, y_te = train_test_split(X_pca, y, test_size=0.3, random_state=10)

rtr_pca, ctr_pca = X_tr.shape
print('The PCA training set comprises of', rtr_pca, 'rows and', ctr_pca, 'columns.')
rt_pca, ct_pca = X_te.shape
print('The PCA test set comprises of', rt_pca, 'rows and', ct_pca, 'columns.')

svc_model = SVC(kernel='linear', gamma='scale', C=1)

# ---- Linear SVC on the PCA-reduced data ----
print('SVC Model of dataset with PCA')
svc_model.fit(X_tr, y_tr)                   # fit the model
y_predictSVC_pca = svc_model.predict(X_te)  # predict the response

modeltrSVC_pca_score = svc_model.score(X_tr, y_tr)
print('\nAccuracy Score of Training Data:', modeltrSVC_pca_score)
modelSVC_pca_score = accuracy_score(y_te, y_predictSVC_pca)
print('\nAccuracy Score of Test Data:', modelSVC_pca_score)

cfm_SVC_pca = confusion_matrix(y_te, y_predictSVC_pca)
print('\nClassification Report of SVC Model:\n ', classification_report(y_te, y_predictSVC_pca))
print('\nMean Absolute Error of SVC:\n', mean_absolute_error(y_te, y_predictSVC_pca))
print('\nConfusion Matrix of SVC:\n', cfm_SVC_pca)

# Heatmap of the confusion matrix. confusion_matrix(y_true, y_pred) puts the
# actual classes on the rows (y-axis) and the predictions on the columns
# (x-axis); the labels below reflect that (they were swapped originally).
plt.figure(figsize=(8, 6))
sns.heatmap(cfm_SVC_pca, annot=True, cmap='YlGnBu', fmt='d')
plt.xlabel('Predicted Classes', fontsize=15)
plt.ylabel('Actual Classes', fontsize=15)
plt.title('Confusion Matrix HeatMap of SVC with PCA Model', fontsize=15)

# Macro-averaged precision / recall / F1 on the PCA test set.
precision_SVC_pca, recall_SVC_pca, f1_score_SVC_pca, support = precision_recall_fscore_support(
    y_te, y_predictSVC_pca, average='macro')
print('Precision Score :', '%0.2f' % precision_SVC_pca)
print('Recall Score :', '%0.2f' % recall_SVC_pca)
print('F1-Score :', '%0.2f' % f1_score_SVC_pca)
print('Accuracy Score :', '%0.2f' % modelSVC_pca_score)

# ---- Linear SVC on the full scaled data (no PCA) ----
# NOTE: the same estimator instance is deliberately re-fit on the new split,
# matching the original flow.
print('SVC Model of dataset without PCA')
svc_model.fit(X_train, y_train)
y_predictSVC = svc_model.predict(X_test)

modeltrSVC_score = svc_model.score(X_train, y_train)
print('\nAccuracy Score of Training Data:', modeltrSVC_score)

cfm_SVC = confusion_matrix(y_test, y_predictSVC)
modelSVC_score = accuracy_score(y_test, y_predictSVC)  # evaluate accuracy
print('\nAccuracy Score of Test Data:', modelSVC_score)
print('\nClassification Report of SVC Model:\n ', classification_report(y_test, y_predictSVC))
print('\nMean Absolute Error of SVC:\n', mean_absolute_error(y_test, y_predictSVC))
print('\nConfusion Matrix of SVC:\n', cfm_SVC)

plt.figure(figsize=(8, 6))
sns.heatmap(cfm_SVC, annot=True, cmap='YlGnBu', fmt='d')
plt.xlabel('Predicted Classes', fontsize=15)  # columns = predictions
plt.ylabel('Actual Classes', fontsize=15)     # rows = ground truth
plt.title('Confusion Matrix HeatMap of SVC Model', fontsize=15)

precision_SVC, recall_SVC, f1_score_SVC, support = precision_recall_fscore_support(
    y_test, y_predictSVC, average='macro')
print('Precision Score :', '%0.2f' % precision_SVC)
print('Recall Score :', '%0.2f' % recall_SVC)
print('F1-Score :', '%0.2f' % f1_score_SVC)
print('Accuracy Score :', '%0.2f' % modelSVC_score)
# Side-by-side results of the SVC models with and without PCA.
mdllists = [
    ['Support Vector Classifier with PCA',
     modeltrSVC_pca_score * 100, modelSVC_pca_score * 100,
     recall_SVC_pca * 100, precision_SVC_pca * 100],
    ['Support Vector Classifier without PCA',
     modeltrSVC_score * 100, modelSVC_score * 100,
     recall_SVC * 100, precision_SVC * 100],
]
mdl_df = pd.DataFrame(mdllists,
                      columns=['Model', 'Accuracy Score of Training Data',
                               'Accuracy Score of Test Data', 'RecallScore',
                               'Precision Score'])
mdl_df
# The test-data accuracy score dropped by about 13.78% with PCA.
# Hyper-parameter search for the PCA model: RBF vs linear kernel over a
# small grid of C values.
param_grid_pca = [{'kernel': ['rbf'], 'C': [0.01, 0.05, 0.5, 1]},
                  {'kernel': ['linear'], 'C': [0.01, 0.05, 0.5, 1]}]

# Build the grid-search classifier and train it on the PCA training split.
clf_grid_pca = GridSearchCV(SVC(), param_grid_pca, verbose=1)
clf_grid_pca.fit(X_tr, y_tr)

print('\n\nBest Parameters:\n', clf_grid_pca.best_params_)
print('\n\nBest Estimators:\n', clf_grid_pca.best_estimator_)
# Grid search selects the RBF kernel with C (regularization parameter) = 1.
# Refit the PCA model with the best grid-search parameters (RBF kernel, C=1).
svc_cv_pca_model = SVC(kernel='rbf', gamma='scale', C=1)
svc_cv_pca_model.fit(X_tr, y_tr)                        # fit the model
y_predictSVC_cv_pca = svc_cv_pca_model.predict(X_te)    # predict the response

modeltrSVC_cv_pca_score = svc_cv_pca_model.score(X_tr, y_tr)
print('\nAccuracy Score of Training Data:', modeltrSVC_cv_pca_score)
modelSVC_cv_pca_score = accuracy_score(y_te, y_predictSVC_cv_pca)
print('\nAccuracy Score of Test Data:', modelSVC_cv_pca_score)

cfm_SVC_cv_pca = confusion_matrix(y_te, y_predictSVC_cv_pca)
print('\nClassification Report of SVC Model:\n ', classification_report(y_te, y_predictSVC_cv_pca))
print('\nMean Absolute Error of SVC:\n', mean_absolute_error(y_te, y_predictSVC_cv_pca))
print('\nConfusion Matrix of SVC:\n', cfm_SVC_cv_pca)

# Rows of the confusion matrix are actual classes, columns are predictions,
# so the y/x labels are set accordingly (they were swapped originally).
plt.figure(figsize=(8, 6))
sns.heatmap(cfm_SVC_cv_pca, annot=True, cmap='YlGnBu', fmt='d')
plt.xlabel('Predicted Classes', fontsize=15)
plt.ylabel('Actual Classes', fontsize=15)
plt.title('Confusion Matrix HeatMap of SVC with PCA Model', fontsize=15)

precision_SVC_cv_pca, recall_SVC_cv_pca, f1_score_SVC_cv_pca, support = precision_recall_fscore_support(
    y_te, y_predictSVC_cv_pca, average='macro')
print('Precision Score :', '%0.2f' % precision_SVC_cv_pca)
print('Recall Score :', '%0.2f' % recall_SVC_cv_pca)
print('F1-Score :', '%0.2f' % f1_score_SVC_cv_pca)
print('Accuracy Score :', '%0.2f' % modelSVC_cv_pca_score)

# Same hyper-parameter grid for the model without PCA.
param_grid = [{'kernel': ['rbf'], 'C': [0.01, 0.05, 0.5, 1]},
              {'kernel': ['linear'], 'C': [0.01, 0.05, 0.5, 1]}]

# Build the grid-search classifier and train it on the non-PCA split.
clf_grid = GridSearchCV(SVC(), param_grid, verbose=1)
clf_grid.fit(X_train, y_train)

print('\n\nBest Parameters:\n', clf_grid.best_params_)
print('\n\nBest Estimators:\n', clf_grid.best_estimator_)
# Grid search again selects the RBF kernel with C (regularization) = 1.
# Best grid-search model (RBF kernel, C=1) refit on the full scaled data.
svc_cv_model = SVC(kernel='rbf', gamma='scale', C=1)
svc_cv_model.fit(X_train, y_train)                  # fit the model
y_predictSVC_cv = svc_cv_model.predict(X_test)      # predict the response

modeltrSVC_cv_score = svc_cv_model.score(X_train, y_train)
print('Accuracy Score of Training Data: ', modeltrSVC_cv_score)

cfm_SVC_cv = confusion_matrix(y_test, y_predictSVC_cv)
modelSVC_score_cv = accuracy_score(y_test, y_predictSVC_cv)  # evaluate accuracy
print('\nAccuracy Score of Test Data:', modelSVC_score_cv)
# The original report labels said "Linear" although this estimator uses the
# RBF kernel; they are corrected to "Grid Search" below.
print('\nClassification Report of SVC Grid Search Model:\n ', classification_report(y_test, y_predictSVC_cv))
print('\nMean Absolute Error of SVC Grid Search:\n', mean_absolute_error(y_test, y_predictSVC_cv))
print('\nConfusion Matrix of SVC Grid Search:\n', cfm_SVC_cv)

plt.figure(figsize=(8, 6))
sns.heatmap(cfm_SVC_cv, annot=True, cmap='YlGnBu', fmt='d')
plt.xlabel('Predicted Classes', fontsize=15)  # columns = predictions
plt.ylabel('Actual Classes', fontsize=15)     # rows = ground truth
plt.title('Confusion Matrix HeatMap of SVC Grid Search Model', fontsize=15)

precision_SVC_cv, recall_SVC_cv, f1_score_SVC_cv, support = precision_recall_fscore_support(
    y_test, y_predictSVC_cv, average='macro')
print('Precision Score :', '%0.2f' % precision_SVC_cv)
print('Recall Score :', '%0.2f' % recall_SVC_cv)
print('F1-Score :', '%0.2f' % f1_score_SVC_cv)
print('Accuracy Score :', '%0.2f' % modelSVC_score_cv)

# Results of the two grid-search models.
modellsts = [
    ['Support Vector Classifier with PCA using Grid Search',
     modeltrSVC_cv_pca_score * 100, modelSVC_cv_pca_score * 100,
     recall_SVC_cv_pca * 100, precision_SVC_cv_pca * 100],
    ['Support Vector Classifier using Grid Search',
     modeltrSVC_cv_score * 100, modelSVC_score_cv * 100,
     recall_SVC_cv * 100, precision_SVC_cv * 100],
]
md_df = pd.DataFrame(modellsts,
                     columns=['Model', 'Accuracy Score of Training Data',
                              'Accuracy Score of Test Data', 'Recall Score',
                              'Precision Score'])
md_df

# Results of all four models side by side.
modellists = [
    ['Support Vector Classifier with PCA',
     modeltrSVC_pca_score * 100, modelSVC_pca_score * 100,
     recall_SVC_pca * 100, precision_SVC_pca * 100],
    ['Support Vector Classifier with PCA using Grid Search',
     modeltrSVC_cv_pca_score * 100, modelSVC_cv_pca_score * 100,
     recall_SVC_cv_pca * 100, precision_SVC_cv_pca * 100],
    ['Support Vector Classifier using Grid Search',
     modeltrSVC_cv_score * 100, modelSVC_score_cv * 100,
     recall_SVC_cv * 100, precision_SVC_cv * 100],
    ['Support Vector Classifier without PCA',
     modeltrSVC_score * 100, modelSVC_score * 100,
     recall_SVC * 100, precision_SVC * 100],
]
model_df = pd.DataFrame(modellists,
                        columns=['Model', 'Accuracy Score of Training Data',
                                 'Accuracy Score of Test Data', 'Recall Score',
                                 'Precision Score'])
model_df

plt.figure(figsize=(20, 5))
sns.barplot(x=model_df['Model'], y=model_df['Accuracy Score of Test Data'], data=model_df)
plt.xlabel('Model', fontsize=15)
plt.ylabel('Accuracy Score', fontsize=15)
plt.title('Comparison of Classification Models', fontsize=15)

# Conclusion:
# Dimensionality reduction with PCA helped in this case study. The original
# model with grid search gives the best accuracy, but the PCA + grid-search
# model performs on par with it while using only 7 components instead of the
# original features, so it trains faster with little loss of information.
From the score summary of the SVM classifiers above, it is understood that the original model with grid search gives the best accuracy.
However, the model using the PCA technique with grid search performs on par with the original grid-search model.
Using the PCA technique, the variables are reduced from 18 to 7 without significant loss of information from the data.
Hence the model with PCA-based dimensionality reduction took less computational time.
Besides reducing the computational time, the model's performance is also on par with the original model.
Hence PCA plays a vital role in this case study.